import pandas as pd
import numpy as np
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from lime.lime_tabular import LimeTabularExplainer
# Subset of columns needed from the raw COVID-19 line-list file.
cols = [
    "age", "sex", "country",
    "date_onset_symptoms", "date_admission_hospital", "date_confirmation",
    "symptoms", "chronic_disease", "outcome", "date_death_or_discharge",
]
data_raw = pd.read_csv("data/covid19.csv", usecols=cols)
print(data_raw.shape)
data_raw.head(3)
# Flatten the comma-separated symptom strings into one list of trimmed tokens,
# skipping records with no symptoms recorded.
symptoms_list = [
    token.strip()
    for raw in data_raw.symptoms
    if not pd.isna(raw)
    for token in raw.split(",")
]
symptoms_raw = pd.Series(symptoms_list).map(lambda x: x.lower())
# Count occurrences per distinct symptom; the old integer index column is
# reused as the count column ("cnt") after the groupby.
symptoms_counts = (
    symptoms_raw
    .reset_index()
    .rename(columns={0: "symptom"})
    .groupby("symptom")
    .count()
    .reset_index()
    .rename(columns={"index": "cnt"})
)
# Keep only symptoms reported in more than 4 cases.
signif_counts = symptoms_counts.query("cnt > 4").reset_index(drop=True)
# Symptoms that occurred in more than 4 cases (hand-curated from signif_counts).
signif_symptoms_list = [
    "asymptomatic", "bone pain", "chest tightness", "chills", "cough", "coughing", "diarrhea", "discomfort", "dyspnea",
    "fatigue", "fever", "headache", "joint pain", "malaise", "myalgia", "nausea", "phlegm", "pneumonia", "pneumonitis",
    "runny nose", "shortness of breath", "sneezing", "sore throat", "sputum"
]
# CamelCase feature name -> raw substring searched for in the symptoms field.
# The two manual entries come first so the key (and later column) order is
# preserved exactly.
signif_symptoms = {
    "MuscleSoreness": "soreness",
    "Weakness": "weak",
    **{s.title().replace(" ", ""): s for s in signif_symptoms_list},
}
def map_outcome(x):
    """Binary target: 1 if the recorded outcome denotes death, else 0.

    Missing outcomes are treated as survival (0).
    """
    if pd.isna(x):
        return 0
    return 1 if x in ("died", "death") else 0
def map_age(x):
    """Map a raw age value to a decade bucket (i covers ages 10*i .. 10*i + 9).

    Accepts plain numbers (25, "25") and decade-range strings such as
    "20-29" (same leading digit, second char "0", last char "9").
    Returns np.nan for anything unparseable instead of raising.
    """
    try:
        x = float(x)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric inputs (e.g. None), which
        # the original version let propagate. The isinstance/length guard
        # prevents IndexError on strings shorter than 2 characters.
        if isinstance(x, str) and len(x) >= 2 and x[0] == x[-2] and x[1] == "0" and x[-1] == "9":
            x = int(x[:2])
        else:
            return np.nan
    return x // 10
def map_sex(x):
    """Encode sex as 1 for "male" and 0 otherwise; missing stays NaN."""
    if pd.isna(x):
        return np.nan
    return 1 if x == "male" else 0
def days_diff(x, y, date_format="%d.%m.%Y"):
    """Return x - y in whole days, or np.nan if either date fails to parse.

    Both arguments are date strings in `date_format` (day.month.year by
    default); missing values (NaN/None) parse-fail and yield NaN.
    """
    try:
        x = dt.strptime(x, date_format)
        y = dt.strptime(y, date_format)
    except (TypeError, ValueError):
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # SystemExit; only parse failures (bad format -> ValueError,
        # non-string such as NaN -> TypeError) should map to NaN.
        return np.nan
    return (x - y).days
# Engineered feature table, one row per raw record.
base_columns = [
    "HasDied",
    "Age",
    "Sex",
    "Country",
    "DaysInHospital",
    "DaysBeforeHospitalization",
    "DaysBeforeConfirmation"
]
# Stable integer code per country; a missing country stays NaN so it is still
# treated as a missing value downstream.
country_dict = dict([(c, np.nan if pd.isna(c) else i) for i, c in enumerate(data_raw.country.unique())])
rows = []
for i in range(data_raw.shape[0]):
    d = {}
    row = data_raw.loc[i, :]
    d["HasDied"] = map_outcome(row.outcome)
    d["Age"] = map_age(row.age)  # Age i corresponds to ages (10 * i, 10 * (i + 1) - 1)
    d["Sex"] = map_sex(row.sex)  # 1 - man, 0 - woman
    d["Country"] = country_dict.get(row.country)
    d["DaysInHospital"] = days_diff(row.date_death_or_discharge, row.date_admission_hospital)
    d["DaysBeforeHospitalization"] = days_diff(row.date_admission_hospital, row.date_onset_symptoms)
    d["DaysBeforeConfirmation"] = days_diff(row.date_confirmation, row.date_onset_symptoms)
    d["DaysAfterConfirmation"] = days_diff(row.date_death_or_discharge, row.date_confirmation)  # Days as a confirmed case
    # Symptom indicator columns. NOTE(review): `find` does substring matching,
    # so e.g. "cough" also fires on "coughing" — confirm this is intentional.
    for k, v in signif_symptoms.items():
        d[k] = np.nan if pd.isna(row.symptoms) else int(row.symptoms.find(v) != -1)
    rows.append(d)
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call (O(n^2)); building once from a list of dicts is O(n) and the
# explicit column list reproduces the original column order exactly.
data = pd.DataFrame(rows, columns=base_columns + ["DaysAfterConfirmation"] + list(signif_symptoms))
data.head(5)
feature_names = data.columns[1:]
# Only records with an outcome value present.
has_outcome = ~data_raw.outcome.isna()
# Keep rows with at least 3 non-missing fields; computed once instead of
# re-filtering/re-dropping for each of the two train_test_split arguments.
labeled = data[has_outcome].dropna(thresh=3)
X_train, X_test, y_train, y_test = train_test_split(
    np.array(labeled.iloc[:, 1:]),   # features: everything but HasDied
    np.array(labeled.iloc[:, 0]),    # target: HasDied
    test_size=0.3,
    random_state=58
)
# Encode missing values with a -9999 sentinel that the tree models can
# split on.
X_train = np.where(np.isnan(X_train), -9999, X_train)
X_test = np.where(np.isnan(X_test), -9999, X_test)
model = CatBoostClassifier(random_seed=58)
model.fit(X_train, y_train, verbose=False)
# The classes are extremely unbalanced, hence it's a good sign if the
# classifier predicts literally anything to be in class 1.
pred_labels = model.predict(X_test)
(pred_labels == 1).sum(), (y_test == 1).sum()
# Accuracy
(pred_labels == y_test).sum() / len(y_test)
# F1 score. sklearn's signature is f1_score(y_true, y_pred); the original
# passed them swapped (harmless for binary F1, which is symmetric in
# precision/recall, but wrong for any other `average` setting).
f1_score(y_test, pred_labels)
# Indices of correctly predicted deaths (true positives).
np.where(np.logical_and(pred_labels == 1, y_test == 1))
obs = X_test[43, :]
pd.DataFrame(obs, feature_names, columns=["Value"])
# Model predicts 1 - death of the patient.
model.predict(obs)
explainer = LimeTabularExplainer(
    X_train,
    class_names=["survival", "death"],
    feature_names=feature_names,
    discretize_continuous=False,
)
explanation = explainer.explain_instance(obs, model.predict_proba)
explanation.show_in_notebook(show_all=False)
# Explain a few more sampled test observations the same way.
for idx in (31, 1, 42):
    explainer.explain_instance(X_test[idx, :], model.predict_proba).show_in_notebook(show_all=False)
# The explanations appear to be very stable: each of the sampled observations
# had more or less the same decomposition — led by Age and Country, with the
# other variables having little to no impact.
# Baseline tree ensemble to compare attributions against CatBoost.
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

def compare_attribution(i):
    """Show LIME attributions for test row `i`: CatBoost first, then RF."""
    for predict_fn in (model.predict_proba, rf_model.predict_proba):
        explainer.explain_instance(X_test[i, :], predict_fn).show_in_notebook(show_all=False)

compare_attribution(3)
compare_attribution(19)
compare_attribution(43)
# The Random Forest classifier (the bottom summary for each observation) seems
# to have a more balanced attribution of the Age and Country variables than the
# CatBoost classifier (the top summary).